f0d92029cfad60eee4237f020be493f34ede4e0a,KeywordSearch/src/org/sleuthkit/autopsy/keywordsearch/HtmlTextExtractor.java,HtmlTextExtractor,index,#AbstractFile#IngestJobContext#,100
Before Change
    boolean eof = false;
    //we read at most 1024 chars at a time; this seems to be the most this Reader will return
    while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
        if (context.fileIngestIsCancelled()) {
            try {
                stream.close();
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
            }
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
            }
            return false;
        }
        totalRead += readSize;
        //consume more chars to fill the entire chunk (leave EXTRA_CHARS of headroom to finish the word)
        while ((totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
                && (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
            totalRead += readSize;
        }
        if (readSize == -1) {
            //this is the last chunk
            eof = true;
        } else {
            //read up to whitespace so words are not broken across chunks
            while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
                    && !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
                    && (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
                totalRead += readSize;
            }
            if (readSize == -1) {
                //this is the last chunk
                eof = true;
            }
        }
        //logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
        //encode to bytes so the text can be indexed as a byte stream
        String extracted;
        //set the initial capacity to the chars read plus BOM headroom, to avoid resizing
        StringBuilder sb = new StringBuilder((int) totalRead + 1000);
        //a BOM could be injected here (saving a byte buffer realloc later); it would be converted to the target encoding's BOM
        //sb.append(UTF16BOM); //BOM disabled; not needed since we bypass Tika
        if (totalRead < MAX_EXTR_TEXT_CHARS) {
            sb.append(textChunkBuf, 0, (int) totalRead);
        } else {
            sb.append(textChunkBuf);
        }
        //reset for the next chunk
        totalRead = 0;
        extracted = sb.toString();
        //the BOM, if present, is converted automatically to the target charset encoding
        byte[] encodedBytes = extracted.getBytes(outCharset);
        AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
        try {
            chunk.index(ingester, encodedBytes, encodedBytes.length, outCharset);
            ++this.numChunks;
        } catch (Ingester.IngesterException ingEx) {
            success = false;
            logger.log(Level.WARNING, "Ingester had a problem with extracted HTML from file '" //NON-NLS
                    + sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
            throw ingEx; //need to rethrow to signal the error and move on
        }
    }
} catch (IOException ex) {
    logger.log(Level.WARNING, "Unable to read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
    success = false;
} catch (Exception ex) {
    logger.log(Level.WARNING, "Unexpected error, can't read content stream from " + sourceFile.getId() + ": " + sourceFile.getName(), ex); //NON-NLS
    success = false;
} finally {
    try {
        stream.close();
    } catch (IOException ex) {
        logger.log(Level.WARNING, "Unable to close content stream from " + sourceFile.getId(), ex); //NON-NLS
    }
    try {
        if (reader != null) {
            reader.close();
        }
    } catch (IOException ex) {
        logger.log(Level.WARNING, "Unable to close content reader from " + sourceFile.getId(), ex); //NON-NLS
    }
}
//after all chunks are indexed, ingest the parent file itself without content and store numChunks
if (!context.fileIngestIsCancelled()) {
    ingester.ingest(this);
}
return success;
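
The inner loops above implement the chunking scheme: fill most of the chunk buffer in SINGLE_READ_CHARS slices, then read one char at a time until whitespace so that a word is never split across two indexed chunks. Below is a minimal standalone sketch of that scheme; the class name and the constant values are hypothetical stand-ins for the extractor's SINGLE_READ_CHARS, EXTRA_CHARS, and MAX_EXTR_TEXT_CHARS.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public class ChunkReaderSketch {

    //hypothetical stand-ins for the extractor's constants
    private static final int SINGLE_READ_CHARS = 1024;
    private static final int EXTRA_CHARS = 128;
    private static final int MAX_CHUNK_CHARS = 4096;

    //fills buf in SINGLE_READ_CHARS slices, then reads one char at a time until
    //whitespace so the chunk ends on a word boundary; returns the number of chars
    //read, or -1 if the reader was already at EOF
    static int readChunk(Reader reader, char[] buf) throws IOException {
        int n = reader.read(buf, 0, SINGLE_READ_CHARS);
        if (n == -1) {
            return -1;
        }
        int total = n;
        //consume more chars to fill the chunk, leaving EXTRA_CHARS of headroom for the last word
        while (total < MAX_CHUNK_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS
                && (n = reader.read(buf, total, SINGLE_READ_CHARS)) != -1) {
            total += n;
        }
        //read up to whitespace so the last word is not split across chunks
        while (n != -1 && total < MAX_CHUNK_CHARS
                && !Character.isWhitespace(buf[total - 1])
                && (n = reader.read(buf, total, 1)) != -1) {
            total += n;
        }
        return total;
    }

    public static void main(String[] args) throws IOException {
        char[] buf = new char[MAX_CHUNK_CHARS];
        try (Reader r = new StringReader("lorem ipsum dolor sit amet ".repeat(500))) {
            int read;
            while ((read = readChunk(r, buf)) != -1) {
                System.out.println("chunk of " + read + " chars");
            }
        }
    }
}

Each call returns one chunk's length (or -1 at EOF); a caller would then copy buf[0, length) into a String and hand the encoded bytes to the indexer, as the extractor does above.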
After Change
    boolean eof = false;
    //we read at most 1024 chars at a time; this seems to be the most this Reader will return
    while (!eof && (readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS)) != -1) {
        if (context.fileIngestIsCancelled()) {
            ingester.ingest(this);
            return true;
        }
        totalRead += readSize;
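
In the rewritten loop, cancellation now ingests the parent record and returns true immediately rather than closing the stream and reader inline and returning false (cleanup presumably stays in the method's finally block, which is not shown here). The snippet cuts off before the chunk-assembly code, but the encoding step it feeds is worth illustrating: each assembled chunk is copied into a String and encoded to bytes in the configured output charset before indexing. A minimal sketch of that conversion, assuming UTF-8 as a stand-in for outCharset:

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class ChunkEncodeSketch {

    public static void main(String[] args) {
        //assumed output charset; the extractor configures its own outCharset
        Charset outCharset = StandardCharsets.UTF_8;

        //stand-ins for one assembled chunk buffer and its fill count
        char[] textChunkBuf = "example chunk text".toCharArray();
        int totalRead = textChunkBuf.length;

        //same pattern as the extractor: size the builder up front and copy
        //only the chars actually read...
        StringBuilder sb = new StringBuilder(totalRead + 1000);
        sb.append(textChunkBuf, 0, totalRead);

        //...then encode the chunk text to bytes in the output charset for the indexer
        byte[] encodedBytes = sb.toString().getBytes(outCharset);
        System.out.println(encodedBytes.length + " bytes to index");
    }
}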